## packages: remove or add your necessary packages
required_packages <- c("tidyverse", "readxl", "ggthemes", "hrbrthemes", "extrafont", "plotly", "scales", "stringr", "gganimate", "here", "tidytext", "sentimentr", "scales", "DT", "here", "sm", "mblm", "glue", "fs", "knitr", "rmdformats", "janitor", "urltools", "colorspace", "pdftools")
library(ggplot2) # CRAN v3.3.6
library(colorspace) # CRAN v2.0-3
library(here) # CRAN v1.0.1
library(dplyr) # CRAN v1.0.10
library(janitor) # CRAN v2.1.0
library(gt) # CRAN v0.5.0
library(tidyr) # CRAN v1.2.1
library(readr) # CRAN v2.1.3
library(stringr) # CRAN v1.4.1
library(tidytext)
library(ggalt)
library(forcats)
library(lubridate)
# for(i in required_packages) {
# if(!require(i, character.only = T)) {
#
# # if package is not existing, install then load the package
# install.packages(i, dependencies = T)
# require(i, character.only = T)
# }
# }
## save plots?
save <- TRUE
#save <- FALSE
## quality of png's
dpi <- 750
## font adjust; please adjust to client´s website
#extrafont::loadfonts(device = "win", quiet = TRUE)
#font_add_google("Montserrat", "Montserrat")
# font_add_google("Overpass", "Overpass")
# font_add_google("Overpass Mono", "Overpass Mono")
## theme updates; please adjust to client´s website
#theme_set(ggthemes::theme_clean(base_size = 15))
theme_set(ggthemes::theme_clean(base_size = 15, base_family = "Montserrat"))
theme_update(plot.margin = margin(30, 30, 30, 30),
plot.background = element_rect(color = "white",
fill = "white"),
plot.title = element_text(size = 20,
face = "bold",
lineheight = 1.05,
hjust = .5,
margin = margin(10, 0, 25, 0)),
plot.title.position = "plot",
plot.caption = element_text(color = "grey40",
size = 9,
margin = margin(20, 0, -20, 0)),
plot.caption.position = "plot",
axis.line.x = element_line(color = "black",
size = .8),
axis.line.y = element_line(color = "black",
size = .8),
axis.title.x = element_text(size = 16,
face = "bold",
margin = margin(t = 20)),
axis.title.y = element_text(size = 16,
face = "bold",
margin = margin(r = 20)),
axis.text = element_text(size = 11,
color = "black",
face = "bold"),
axis.text.x = element_text(margin = margin(t = 10)),
axis.text.y = element_text(margin = margin(r = 10)),
axis.ticks = element_blank(),
panel.grid.major.x = element_line(size = .6,
color = "#eaeaea",
linetype = "solid"),
panel.grid.major.y = element_line(size = .6,
color = "#eaeaea",
linetype = "solid"),
panel.grid.minor.x = element_line(size = .6,
color = "#eaeaea",
linetype = "solid"),
panel.grid.minor.y = element_blank(),
panel.spacing.x = unit(4, "lines"),
panel.spacing.y = unit(2, "lines"),
legend.position = "top",
legend.title = element_text(family = "Montserrat",
color = "black",
size = 14,
margin = margin(5, 0, 5, 0)),
legend.text = element_text(family = "Montserrat",
color = "black",
size = 11,
margin = margin(4.5, 4.5, 4.5, 4.5)),
legend.background = element_rect(fill = NA,
color = NA),
legend.key = element_rect(color = NA, fill = NA),
#legend.key.width = unit(5, "lines"),
#legend.spacing.x = unit(.05, "pt"),
#legend.spacing.y = unit(.55, "pt"),
#legend.margin = margin(0, 0, 10, 0),
strip.text = element_text(face = "bold",
margin = margin(b = 10)))
## theme settings for flipped plots
theme_flip <-
theme(panel.grid.minor.x = element_blank(),
panel.grid.minor.y = element_line(size = .6,
color = "#eaeaea"))
## theme settings for maps
theme_map <-
theme_void(base_family = "Montserrat") +
theme(legend.direction = "horizontal",
legend.box = "horizontal",
legend.margin = margin(10, 10, 10, 10),
legend.title = element_text(size = 17,
face = "bold"),
legend.text = element_text(color = "grey33",
size = 12),
plot.margin = margin(15, 5, 15, 5),
plot.title = element_text(face = "bold",
size = 20,
hjust = .5,
margin = margin(30, 0, 10, 0)),
plot.subtitle = element_text(face = "bold",
color = "grey33",
size = 17,
hjust = .5,
margin = margin(10, 0, -30, 0)),
plot.caption = element_text(size = 14,
color = "grey33",
hjust = .97,
margin = margin(-30, 0, 0, 0)))
## numeric format for labels
num_format <- scales::format_format(big.mark = ",", small.mark = ",", scientific = F)
## main color backlinko
bl_col <- "#00d188"
bl_dark <- darken(bl_col, .3, space = "HLS")
## colors + labels for interval stripes
int_cols <- c("#bce2d5", "#79d8b6", bl_col, "#009f66", "#006c45", "#003925")
int_perc <- c("100%", "95%", "75%", "50%", "25%", "5%")
## colors for degrees (Bachelors, Massters, Doctorate in reverse order)
cols_degree <- c("#e64500", "#FFCC00", darken(bl_col, .1))
## gradient colors for position
colfunc <- colorRampPalette(c(bl_col, "#bce2d5"))
pos_cols <- colfunc(10)youtube_data <- read_csv(here("proc_data","youtube_data_proc.csv"))
youtube_data_activities <- read_csv(here("proc_data","youtube_data_activities_proc.csv"))
tiktok_data <- read_csv(here("proc_data","tiktok_data_proc.csv"))
tiktok_data_activities <- read_csv(here("proc_data","tiktok_data_activities_proc.csv"))yt_vids <- youtube_data %>% distinct(yt_video_id) %>% nrow()
tt_vids <- tiktok_data %>% distinct(tt_video_id) %>% nrow()youtube_data_activities %>% group_by(yt_video_id) %>% summarise(idn=max(idea)) %>%
pull(idn) %>% {length(which(.>1))}-> mult_ideas_yt
tiktok_data_activities %>% group_by(tt_video_id) %>% summarise(idn=max(idea)) %>%
pull(idn) %>% {length(which(.>1))} -> mult_ideas_tt
meanytlength <- youtube_data$video_length %>% summary %>% {./60}
meanttlength <- tiktok_data$video_meta_duration %>% summary YouTube: 177 videos (unique video url identifiers,
includes YT shorts)
TikTok: 177 videos
YouTube videos are longer (12.15141 minutes on average for the sampled videos), so approximately one third of the videos examined (53/177) included >1 money-making idea. TikTok videos have a shorter maximum length (3 to 10 minutes; 41.14384 seconds on average for the sampled videos) so videos on this platform tend to feature a single idea. Only 4 of the 145 TikTok videos examined provided more than one money-making idea.
youtube_data <- youtube_data %>% mutate(month=month(ymd(youtube_data$publish_date)),
pyear=year(ymd(youtube_data$publish_date))) %>%
mutate(pub_date=ymd(publish_date))
tiktok_data <- tiktok_data %>% mutate(month=month(ymd_hms(tiktok_data$create_time_iso)), pyear=year(ymd_hms(tiktok_data$create_time_iso))) %>%
mutate(pub_date=date(ymd_hms(create_time_iso))) 3/4 of the YouTube videos examined were published in 2022, and across all the videos sampled (published since 2018), most are from the summer/fall season (Northern Hemisphere).
TikTok videos in the sample were published between 2019-2022, with more videos uploaded with each passing year. The month with most uploads is July.
tiktok_data %>% tabyl(pyear) %>% round(2)## pyear n percent
## 2019 4 0.03
## 2020 29 0.20
## 2021 50 0.34
## 2022 63 0.43
Publication month also varied between platforms.
youtube_data %>% count(month) %>%
ggplot()+
geom_bar(aes(x=month,y=n),stat = "identity")+
scale_x_discrete(limits=month.abb) +labs(subtitle = "YouTube data")tiktok_data %>% count(month) %>%
ggplot()+
geom_bar(aes(x=month,y=n),stat = "identity")+
scale_x_discrete(limits=month.abb) +labs(subtitle = "TikTok data")Considering publication dates, videos published earlier do not tend to accumulate more views and comments over time. Engagement is also mostly unrelated to subscriber/follower counts and thus possibly related to content.
ttdatevc <- tiktok_data %>% select(source,pub_date,
comments=comment_count,
views=play_count,
followers=author_meta_fans)
ytdatevc <- youtube_data %>% select(source,pub_date,
comments=comments,
views=view_count,
followers=subs_numeric)
dates_views_comments <- bind_rows(ttdatevc,ytdatevc)
ggplot(dates_views_comments)+
geom_point(aes(x=pub_date,y=views,color=source))+
labs(x="Publication date")ggplot(dates_views_comments)+
geom_point(aes(x=pub_date,y=comments,color=source))+
labs(x="Publication date")ggplot(dates_views_comments)+
geom_point(aes(x=views,y=comments,color=source))ggplot(dates_views_comments)+
geom_point(aes(x=followers,y=comments,color=source))dates_views_comments %>% filter(followers!=44100000) %>%
ggplot()+
geom_point(aes(x=followers,y=comments,color=source))+
labs(subtitle = "removed outlier")ggplot(dates_views_comments)+
geom_point(aes(x=followers,y=views,color=source))dates_views_comments %>% filter(followers!=44100000) %>%
ggplot()+
geom_point(aes(x=followers,y=views,color=source))+
labs(subtitle = "removed outlier")yt_presenter_demog_gend <- youtube_data %>% tabyl(presenter_gender) %>%
mutate(valid_percent=round(valid_percent,2))
yt_malepct <- yt_presenter_demog_gend$valid_percent[2]
tt_presenter_demog_gend <- tiktok_data %>% tabyl(presenter_gender) %>%
mutate(valid_percent=round(valid_percent,2))
tt_malepct <- tt_presenter_demog_gend$valid_percent[2]
yt_ages <- youtube_data %>% tabyl(presenter_age) %>% na.omit() %>% select(-percent) %>% mutate(percent=round(valid_percent,2)) %>% select(-valid_percent)
tt_ages <- tiktok_data %>% tabyl(presenter_age) %>% na.omit() %>% select(-percent) %>% mutate(percent=round(valid_percent,2)) %>% select(-valid_percent)Male presenters were represented more on both platforms (YouTube: 0.86% and TikTok 0.8%), and the 20-30 y.o. age category had the highest proportion with ~40%.
gt(yt_ages) %>% tab_header("YouTube")| YouTube | ||
|---|---|---|
| presenter_age | n | percent |
| 10 - 20 | 6 | 0.03 |
| 20 - 30 | 76 | 0.43 |
| 30 - 40 | 49 | 0.28 |
| 40 - 50 | 4 | 0.02 |
| 50+ | 3 | 0.02 |
| Voice-over | 20 | 0.11 |
| Voice-over Text-to-Speech | 18 | 0.10 |
gt(tt_ages) %>% tab_header("TikTok")| TikTok | ||
|---|---|---|
| presenter_age | n | percent |
| 10 - 20 | 14 | 0.10 |
| 20 - 30 | 57 | 0.40 |
| 30 - 40 | 24 | 0.17 |
| 40 - 50 | 5 | 0.04 |
| 50+ | 2 | 0.01 |
| Music | 8 | 0.06 |
| Voice-over | 6 | 0.04 |
| Voice-over Text-to-Speech | 25 | 0.18 |
YouTube videos, as categorized by their authors, varied in assignment despite the similar overarching topic.
The most common category was Education, followed by How-to % Stlye, and then all the others.
youtube_data %>% tabyl(category) %>% arrange(-n) %>%
mutate(across(where(is.numeric),round,2)) %>% gt() %>% tab_header(title = "YouTube data")| YouTube data | ||
|---|---|---|
| category | n | percent |
| Education | 96 | 0.54 |
| Howto & Style | 46 | 0.26 |
| People & Blogs | 27 | 0.15 |
| Entertainment | 7 | 0.04 |
| News & Politics | 1 | 0.01 |
ytearn <-
youtube_data_activities %>%
group_by(yt_video_id,idea,earnings_timeframe) %>%
summarise(earn=mean(earnings,na.rm=TRUE)) %>% ungroup() %>% na.omit() %>%
filter(earnings_timeframe!="No timeframe provided")
ttearn <-
tiktok_data_activities %>%
group_by(tt_video_id,idea,earnings_timeframe) %>%
summarise(earn=mean(earnings,na.rm=TRUE)) %>% ungroup() %>% na.omit() %>%
filter(earnings_timeframe!="No timeframe provided")
# earnings time frames
earn_tf <- bind_rows(ytearn,ttearn) %>% count(earnings_timeframe) %>% arrange(-n)
earnings_by_tf <-
bind_rows(ytearn,ttearn) %>% group_by(earnings_timeframe) %>%
summarize(median_earn=median(earn),
min_earn=min(earn),max_earn=max(earn),
sd_earn=sd(earn,na.rm = TRUE)) %>% arrange(-median_earn)gt(earn_tf)| earnings_timeframe | n |
|---|---|
| Days | 69 |
| Months | 66 |
| Hours | 56 |
| One-time earnings | 31 |
| Minutes | 21 |
| Weeks | 9 |
| Years | 5 |
| Per Post | 2 |
gt(earnings_by_tf)| earnings_timeframe | median_earn | min_earn | max_earn | sd_earn |
|---|---|---|---|---|
| Years | 100000.000 | 500.000 | 400000.00 | 155449.21533 |
| Months | 5375.000 | 15.000 | 300000.00 | 50663.54170 |
| Weeks | 1050.000 | 24.000 | 14000.00 | 4357.91659 |
| Days | 500.000 | 5.000 | 7000.00 | 1324.43451 |
| Hours | 40.000 | 3.000 | 487.85 | 109.77971 |
| One-time earnings | 30.000 | 1.000 | 1225.00 | 271.99917 |
| Minutes | 26.000 | 0.042 | 400.00 | 152.51315 |
| Per Post | 16.895 | 0.500 | 33.29 | 23.18603 |
temporal_earn <- c("Days","Hours","Minutes","Months","Weeks","Years")
yt_tempearn <- ytearn %>% filter(earnings_timeframe %in% temporal_earn)
tt_tempearn <- ttearn %>% filter(earnings_timeframe %in% temporal_earn)
yt_hourly_earn <-
yt_tempearn %>% mutate(hourly_earn=case_when(
earnings_timeframe=="Hours"~earn,
earnings_timeframe=="Minutes"~earn/60,
earnings_timeframe=="Days"~earn/8,
earnings_timeframe=="Weeks"~earn/40,
earnings_timeframe=="Months"~earn/200,
earnings_timeframe=="Years"~earn/2400
)) %>% mutate(source="YouTube")
tt_hourly_earn <-
tt_tempearn %>% mutate(hourly_earn=case_when(
earnings_timeframe=="Hours"~earn,
earnings_timeframe=="Minutes"~earn/60,
earnings_timeframe=="Days"~earn/8,
earnings_timeframe=="Weeks"~earn/40,
earnings_timeframe=="Months"~earn/200,
earnings_timeframe=="Years"~earn/2400
)) %>% mutate(source="TikTok")
all_earn <- bind_rows(yt_hourly_earn,tt_hourly_earn)
hourly_med <- median(all_earn$hourly_earn)
bind_rows(yt_hourly_earn,tt_hourly_earn) %>%
ggplot()+
geom_histogram(aes(hourly_earn,fill=source),color="black",alpha=0.5)For videos that report earnings associated with a temporal reference ($ earned per unit of time), earnings can be reported in a common unit by assuming 8 hour work days and 5 day work weeks. The median hourly earnings is 37.5.
Across all videos, earnings are right-skewed. 90% of videos report hourly earnings > 275.
This distribution is also evident within earnings timeframes.
bind_rows(yt_hourly_earn,tt_hourly_earn) %>%
ggplot()+
geom_histogram(aes(earn))+
facet_wrap(~earnings_timeframe,scales = 'free')The more common categories (Education, Howto & Style) did not report the higher mean or median standardized earnings. Instead, the People and Blogs category and Entertainment had the top two positions.
yt_hourlycorrs <- left_join(yt_hourly_earn,youtube_data_activities)
yt_hourlycorrs_chp <- yt_hourlycorrs %>% group_by(yt_video_id,idea) %>%
chop(business_type_level_1) %>% chop(business_type_level_2) %>% chop(skills_required) %>% ungroup()yt_hourlycorrs_chp %>%
group_by(category) %>% summarise(mean_earn=mean(hourly_earn),
med_earn=median(hourly_earn)) %>%
arrange(-mean_earn) %>% gt()| category | mean_earn | med_earn |
|---|---|---|
| People & Blogs | 206.80750 | 75.00 |
| Entertainment | 127.05000 | 125.00 |
| Howto & Style | 90.84293 | 43.75 |
| Education | 68.85482 | 37.50 |
yt_hourlycorrs_chp %>%
ggplot(aes(x=category,y=hourly_earn,color=category))+
geom_jitter() + scale_color_discrete(guide="none")For all YouTube videos, the predominant Business Type for the money-making ideas was Publication, Media, and Blogs, followed by the Service Business. Other business types were less common.
# without earnings
yt_acts_chp <- youtube_data_activities %>% group_by(yt_video_id,idea) %>%
chop(business_type_level_1) %>% chop(business_type_level_2) %>% chop(skills_required) %>% ungroup()
yt_bus1 <- youtube_data_activities %>% group_by(yt_video_id,idea) %>%
unchop(business_type_level_1) %>% ungroup()
bus1ct <- yt_bus1 %>% group_by(yt_video_id,idea) %>%
distinct(yt_video_id,idea,business_type_level_1) %>%
tabyl(business_type_level_1) %>% arrange(-n)
bus1ct %>% na.omit() %>% mutate(percent=round(valid_percent,2)) %>% select(-valid_percent) %>%
gt() %>% tab_header("YouTube",subtitle = "Business Types, all videos") | YouTube | ||
|---|---|---|
| Business Types, all videos | ||
| business_type_level_1 | n | percent |
| Publication, Media & Blog | 175 | 0.45 |
| Service Business | 122 | 0.31 |
| Ecommerce & Consumer | 56 | 0.14 |
| Investing | 27 | 0.07 |
| Software & Tech | 9 | 0.02 |
For videos and ideas with reported earnings, the business activity with the highest earnings (standardized) was Publication, Media, & Blog, followed by investing.
# with earninings
yt_hourlycorrs_bus1 <- yt_hourlycorrs %>% group_by(yt_video_id,idea) %>%
unchop(business_type_level_1) %>% ungroup()
yt_hourlycorrs_bus1 %>% group_by(yt_video_id,idea) %>%
distinct(yt_video_id,idea,business_type_level_1,hourly_earn) %>% ungroup() %>%
group_by(business_type_level_1) %>%
summarise(mean_earn=mean(hourly_earn),
median_earn=median(hourly_earn)) %>%
arrange(-median_earn) %>% gt() %>% tab_header("YouTube",
subtitle = "standardized hourly earning by business types")| YouTube | ||
|---|---|---|
| standardized hourly earning by business types | ||
| business_type_level_1 | mean_earn | median_earn |
| Publication, Media & Blog | 123.72364 | 62.50000 |
| Investing | 31.20833 | 26.25000 |
| Software & Tech | 20.00000 | 20.00000 |
| Service Business | 35.67339 | 17.50000 |
| Ecommerce & Consumer | 48.28748 | 16.77083 |
However, there is considerable variation in earnings across the
different business types
yt_hourlycorrs_bus1 %>% group_by(yt_video_id,idea) %>%
distinct(yt_video_id,idea,business_type_level_1,hourly_earn) %>% ungroup %>%
ggplot()+
geom_jitter(aes(x=str_wrap(business_type_level_1,12),
y=hourly_earn,color=business_type_level_1))+
scale_color_discrete(guide="none")+labs(x="Business Type (level 1)")For TikTok videos, the predominant Business Type for the money-making ideas was Service Business with almost 50% of videos, followed by the Ecommerce & Consumer ventures. Other business types were less common.
# tt without earnings
tt_acts_chp <- tiktok_data_activities %>% group_by(tt_video_id,idea) %>%
chop(business_type_level_1) %>% chop(business_type_level_2) %>% chop(skills_required) %>% ungroup()
tt_bus1 <- tiktok_data_activities %>% group_by(tt_video_id,idea) %>%
unchop(business_type_level_1) %>% ungroup()
tt_bus1ct <- tt_bus1 %>% group_by(tt_video_id,idea) %>%
distinct(tt_video_id,idea,business_type_level_1) %>%
tabyl(business_type_level_1) %>% arrange(-n)
tt_bus1ct %>% na.omit() %>% mutate(percent=round(valid_percent,2)) %>% select(-valid_percent) %>%
gt() %>% tab_header("TikTok",subtitle = "Business Types, all videos") | TikTok | ||
|---|---|---|
| Business Types, all videos | ||
| business_type_level_1 | n | percent |
| Service Business | 74 | 0.49 |
| Ecommerce & Consumer | 36 | 0.24 |
| Publication, Media & Blog | 28 | 0.19 |
| Investing | 12 | 0.08 |
| Software & Tech | 1 | 0.01 |
For videos and ideas with reported earnings, the business activity with the highest earnings (standardized) was Investing, followed by Ecommerce & Consumer
# tt with earninings
tt_hourlycorrs <- left_join(tt_hourly_earn,tiktok_data_activities)
tt_hourlycorrs_chp <- tt_hourlycorrs %>% group_by(tt_video_id,idea) %>%
chop(business_type_level_1) %>% chop(business_type_level_2) %>% chop(skills_required) %>% ungroup()
tt_hourlycorrs_bus1 <- tt_hourlycorrs %>% group_by(tt_video_id,idea) %>%
unchop(business_type_level_1) %>% ungroup()
tt_hourlycorrs_bus1 %>% group_by(tt_video_id,idea) %>%
distinct(tt_video_id,idea,business_type_level_1,hourly_earn) %>% ungroup() %>%
group_by(business_type_level_1) %>%
summarise(mean_earn=mean(hourly_earn),
median_earn=median(hourly_earn)) %>%
arrange(-median_earn) %>% gt()| business_type_level_1 | mean_earn | median_earn |
|---|---|---|
| Investing | 230.37097 | 135.0000 |
| Ecommerce & Consumer | 215.47273 | 109.9006 |
| Publication, Media & Blog | 221.63810 | 31.2500 |
| Service Business | 58.37537 | 25.0000 |
| Software & Tech | 1.48500 | 1.4850 |
However, with some exceptions, earnings do not vary considerably across the different business types
tt_hourlycorrs_bus1 %>% group_by(tt_video_id,idea) %>%
distinct(tt_video_id,idea,business_type_level_1,hourly_earn) %>% ungroup %>%
ggplot()+
geom_jitter(aes(x=str_wrap(business_type_level_1,12),
y=hourly_earn,color=business_type_level_1))+
scale_color_discrete(guide="none")+labs(x="Business Type (level 1)",subtitle = "TikTok")# 1 saving plots in pdf with example
# ggplot(data = mpg, mapping = aes(x = displ, y = hwy, color= drv)) +
# geom_smooth(mapping = aes(linetype = drv), method = 'loess') +
# geom_point()
#
# if(save == T){
# ggsave(here::here("plots", "name_plot.pdf"),
# width = 12.5, height = 8, device = cairo_pdf)
# }
# 2 pdfs will then be converted into the pngs using the 04_convert_pdfs_to_pngs.rmd file. View counts, comments, followers, and standardized earnings are not tightly associated.
yt_hourly_renamed <- yt_hourlycorrs_chp %>%
select(earn=hourly_earn,views=view_count,source,comments,followers=subs_numeric)
tt_hourly_renamed <- tt_hourlycorrs_chp %>%
select(earn=hourly_earn,views=play_count,source,comments=comment_count,
followers=author_meta_fans)
hourlyboth <- bind_rows(yt_hourly_renamed,tt_hourly_renamed)
ggplot(hourlyboth)+aes(x=views,y=earn,color=source)+geom_point()ggplot(hourlyboth)+aes(x=comments,y=earn,color=source)+geom_point()ggplot(hourlyboth)+aes(x=followers,y=earn,color=source)+geom_point()In general, the video titles vary considerably across platforms in terms of length, content and style.
tiktok_data <- tiktok_data %>% mutate(title_noHash=str_extract(text,"^[^#]*"))
yt_tlength <- round(mean(str_length(youtube_data$title)),0)
tt_tlength <-round(mean(str_length(tiktok_data$text)))
tt_tlength_nh <-round(mean(str_length(tiktok_data$title_noHash)))Without various trailing hashtags, YouTube video titles are on average, twice as long as TikTok titles (65 vs. 31 characters). Overall, roughly a third of the length of TikTok titles comprises various hashtags.
The words and bigrams (consecutive sequences of two words) that appear most frequently in the video’s titles vary significantly between platforms.
# tokenize
stopwords <- c("for","in","a","the","to","with","from","by")
title_words_yt <- youtube_data %>% unnest_tokens(title_wrd,title,token = "words") %>%
filter(!title_wrd %in% stopwords)
title_bigrams_yt <- youtube_data %>% unnest_tokens(title_bg,title,token = "ngrams",n=2)
title_words_tt <- tiktok_data %>% unnest_tokens(title_wrd,text,token = "tweets") %>%
filter(!title_wrd %in% stopwords)
title_bigrams_tt <- title_words_tt %>% mutate(nextwrdbg=lead(title_wrd)) %>%
unite(title_bg, title_wrd, nextwrdbg, sep = ' ')
wordsyt <- title_words_yt %>% count(title_wrd) %>% slice_max(n,n=15) %>% mutate(source="YouTube")
wordstt <- title_words_tt %>% count(title_wrd) %>% slice_max(n,n=15) %>% mutate(source="TikTok")
bg_yt <- title_bigrams_yt %>% count(title_bg) %>% slice_max(n,n=15) %>% mutate(source="YouTube")
bg_tt <- title_bigrams_tt %>% count(title_bg) %>% slice_max(n,n=15) %>% mutate(source="TikTok")
top15wrds <- bind_rows(wordsyt,wordstt)
top15bg <- bind_rows(bg_yt,bg_tt)
ggplot(top15wrds)+
geom_lollipop(aes(x=fct_reorder(title_wrd,n),y=n))+
facet_wrap(~source)+labs(x="word or hashtag",y='occurrences')+
coord_flip()ggplot(top15bg)+
geom_lollipop(aes(x=fct_reorder(title_bg,n),y=n))+
facet_wrap(~source)+labs(x="bigram",y='occurrences')+
coord_flip()Considering the top 15 words or bigrams, there is little overlap between platforms.